rm(list=ls())
.generate_lollipop <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(c(1,2), 1)
# generate "ball"
if(k == 1){
x <- stats::rnorm(1, mean = 0, sd = 1)
y <- stats::rnorm(1, mean = 0, sd = 1)
} else {
# k == 2, generate "stick"
x <- stats::runif(1, min = -1, max = 5)
y <- x + stats::rnorm(1, mean = 0, sd = 0.5)
}
c(x,y)
}))
}
.generate_v <- function(n){
t(sapply(1:n, function(i){
# generate "cluster" assignment
k <- sample(c(1,2), 1, prob = c(0.7, 0.3))
# generate "stick"
if(k == 1){
x <- stats::runif(1, min = 0, max = 5)
y <- 2*x + stats::rnorm(1, mean = 0, sd = 0.5)
} else {
# k == 2, generate "stick"
x <- stats::runif(1, min = 0, max = 5)
y <- 0.5*x + stats::rnorm(1, mean = 0, sd = 0.1)
}
c(x,y)
}))
}
.generate_local <- function(n){
t(sapply(1:n, function(i){
k <- sample(1:3, 1, prob = c(0.4, 0.4, 0.2))
# generate "left_ball"
if (k == 1){
x <- stats::rnorm(1, mean = -3, sd = 0.5)
y <- stats::rnorm(1, mean = 0, sd = 0.2)
} else if (k == 2) {
# generate "right_ball"
x <- stats::rnorm(1, mean = 3, sd = 0.3)
y <- stats::rnorm(1, mean = 0, sd = 0.2)
} else{
# generate "local correlated data"
x <- stats::runif(1, min = -1.5, max = 1.5)
y <- 0.5*x + stats::rnorm(1, mean = 0.1, sd = 0.1)
}
c(x,y)
}))
}
.generate_quadratic <- function(n){
t(sapply(1:n, function(i){
x <- stats::runif(1, min = -2, max = 2)
y <- x^2 + stats::rnorm(1, mean = 0, sd = 0.25)
c(x,y)
}))
}
.generate_clusters_L <- function(n){
t(sapply(1:n, function(i){
k <- sample(1:4, 1, prob = rep(1/4, 4))
if (k == 1){
x <- stats::rnorm(1, mean = -1, sd = 0.1)
y <- x + stats::rnorm(1, mean = 1, sd = 1)
} else if (k == 2){
x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
y <- stats::rnorm(1, mean = -2, sd = 0.1)
} else if (k == 3){
x <- stats::rnorm(1, mean = -1, sd = 0.1)
y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
} else {
x <- stats::rnorm(1, mean = -.5, sd = 0.2)
y <- stats::rnorm(1, mean = -2, sd = 0.1)
}
c(x,y)
}))
}
.generate_clusters1 <- function(n){
t(sapply(1:n, function(i){
k <- sample(1:4, 1, prob = seq(0.25, 1, by = 0.25))
if (k == 1){
x <- stats::runif(1, min = -2.5, max = -0.5)
y <- -0.5 * x + stats::rnorm(1, mean = 0.2, sd = 0.2)
} else if (k == 2){
x <- stats::rnorm(1, mean = -3, sd = 0.5)
y <- stats::rnorm(1, mean = -2, sd = 0.3)
} else if (k == 3){
x <- stats::rnorm(1, mean = 0, sd = 0.5)
y <- x + -1 * stats::rnorm(1, mean = 1, sd = 0.5)
} else{
x <- stats::rnorm(1, mean = 2, sd = 0.5)
y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
}
c(x,y)
}))
}
.generate_clusters2 <- function(n){
t(sapply(1:n, function(i){
k <- sample(1:5, 1, prob = rep(1/5, 5))
if (k == 1) {
x <- stats::runif(1, min = -0.5, max = 2)
y <- -0.5 * x + stats::rnorm(1, mean = 1.25, sd = 0.1)
} else if (k == 2) {
x <- stats::rnorm(1, mean = -0.25, sd = 0.1)
y <- stats::rnorm(1, mean = 0, sd = 0.1)
} else if (k == 3) {
x <- stats::rnorm(1, mean = 1, sd = 0.75)
y <- stats::rnorm(1, mean = 0, sd = 0.1)
} else if (k == 4) {
x <- stats::runif(1, min = -0.5, max = 0.25)
y <- x + stats::rnorm(1, mean = 1.25, sd = 0.25)
} else {
x <- stats::rnorm(1, mean = 0.25, sd = 0.25)
y <- stats::rnorm(1, mean = 0.5, sd = 0.1)
}
c(x,y)
}))
}
plot(.generate_clusters_L(500))
plot(.generate_clusters2(500))
generate_data <- function(n, p){
stopifnot(p %% 2 == 0)
# generate p/2 2-dimensional datasets
dat_list <- lapply(1:(p/2), function(round){
type <- sample(1:7, 1)
if(type == 1){
.generate_lollipop(n)
} else if (type == 2) {
.generate_v(n)
} else if (type == 3) {
.generate_clusters_L(n)
} else if (type == 4) {
.generate_local(n)
} else if (type == 5) {
.generate_quadratic(n)
} else if (type == 6) {
.generate_clusters1(n)
} else {
.generate_clusters2(n)
}
})
dat_list <- lapply(dat_list, function(dat){
dat[order(dat[,1]),]
})
# concatenate column-wise
do.call(cbind, dat_list)
}
set.seed(10)
dat <- generate_data(500, 10)
pairs(dat)
shuffling_function <- function(vec, window = 5){
order_vec <- order(vec)
for(i in 1:(length(order_vec)-window)){
order_vec[i:(i+window)] <- sample(order_vec[i:(i+window)])
}
order_vec
}
dat1_A <- dat
new_order1 <- shuffling_function(dat1_A[,1], window = 10)
new_order3 <- shuffling_function(dat1_A[,3], window = 10)
new_order5 <- shuffling_function(dat1_A[,5], window = 10)
new_order7 <- shuffling_function(dat1_A[,7], window = 10)
new_order9 <- shuffling_function(dat1_A[,9], window = 10)
dat1_A_1 <- dat1_A
dat1_A_1[,c(1,2)] <- dat1_A[new_order1, c(1,2)]
dat1_A_2 <- dat1_A_1
dat1_A_2[,c(3,4)] <- dat1_A_1[new_order3, c(3,4)]
dat1_A_3 <- dat1_A_2
dat1_A_3[,c(5,6)] <- dat1_A_2[new_order5, c(5,6)]
dat1_A_4 <- dat1_A_3
dat1_A_4[,c(7,8)] <- dat1_A_3[new_order7, c(7,8)]
dat1_A_5 <- dat1_A_4
dat1_A_5[,c(9,10)] <- dat1_A_4[new_order9, c(9,10)]
for (elem in list(dat1_A, dat1_A_1, dat1_A_2, dat1_A_3, dat1_A_4, dat1_A_5)){
pairs(elem)
}
quadratic_ordering <- function(vec){
order_vec <- order(vec)
n <- length(order_vec)
odd_idx <- seq(1, n, by = 2)
even_idx <- seq(2, n, by = 2)
new_order_vec <- c(order_vec[odd_idx], rev(order_vec[even_idx]))
new_order_vec
}
dat_II <- dat
new_order_II_1 <- quadratic_ordering(dat_II[,1])
new_order_II_3 <- quadratic_ordering(dat_II[,3])
new_order_II_5 <- quadratic_ordering(dat_II[,5])
new_order_II_7 <- quadratic_ordering(dat_II[,7])
new_order_II_9 <- quadratic_ordering(dat_II[,9])
dat_II_1 <- dat_II
dat_II_1[,c(1,2)] <- dat_II[order(dat[,1]), c(1,2)]
dat_II_3 <- dat_II
dat_II_3[,c(3,4)] <- dat_II[new_order_II_3, c(3,4)]
dat_II_5 <- dat_II
dat_II_5[,c(5,6)] <- dat_II[new_order_II_5, c(5,6)]
dat_II_7 <- dat_II
dat_II_7[,c(7,8)] <- dat_II[new_order_II_7, c(7,8)]
dat_II_9 <- dat_II
dat_II_9[,c(9,10)] <- dat_II[new_order_II_9, c(9,10)]
for (i in list(dat_II_1, dat_II_3, dat_II_5, dat_II_7, dat_II_9)){
pairs(i)
}
set.seed(10)
cluster_label <- sample(1:7, 100, replace = T)
dat4 <- dat
for(k in 1:max(cluster_label)){
idx <- which(cluster_label == k)
dat4[idx,c(1,2)] <- dat4[idx[order(dat4[idx,1])], c(1,2)]
dat4[idx,c(7,8)] <- dat4[idx[order(dat4[idx,7])], c(7,8)]
}
pairs(dat4)
New method 1. Generating p/2 bi-variate datasets 2. reshuffle rows for each dataset of p/2 2.1 order all the rows to be sorted in terms of respective first variable (x).